# Import required libraries and pandas to read the raw data csv file to a dataframe
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# Read the consolidated csv file into pandas dataframe
RAW_DATASET = pd.read_csv('data/ConsolidatedDataV2.csv')
# Print the number of rows and columns in the raw dataset
RAW_DATASET.shape
(2856, 30)
# Print the first 6 rows of the raw dataset
RAW_DATASET.head()
| Country | Year | Life expectancy | BMI | ChildMalnutrition | Cholera | Alcohol | HIV | BCG | Adult Mortality | ... | Medical Expenditure | Retirement Age | Diphtheria | Suicides | NCD | Env Pollution | HepatitisB | Measles | Polio | Tuberculosis | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Albania | 2011 | 76.914 | 26.2 | 22.9 | NaN | 5.03 | 100.0 | 97.0 | 103 | ... | 4.795327 | NaN | 99.0 | 7.6 | 490.2 | 0.5 | 99.0 | 99.0 | 99.0 | 9.29 |
| 1 | Albania | 2012 | 77.252 | 26.3 | 23.1 | NaN | 4.43 | 100.0 | 96.0 | 103 | ... | 5.055262 | NaN | 99.0 | 4.8 | 507.1 | 0.4 | 99.0 | 99.0 | 99.0 | 9.29 |
| 2 | Albania | 2013 | 77.554 | 26.4 | 23.6 | NaN | 4.28 | 100.0 | 99.0 | 100 | ... | 5.385599 | NaN | 99.0 | 4.8 | 486.4 | 0.4 | 99.0 | 99.0 | 99.0 | 8.29 |
| 3 | Algeria | 2014 | 75.878 | 25.4 | 33.9 | NaN | 0.54 | 200.0 | 99.0 | 98 | ... | 6.547214 | NaN | 95.0 | 2.8 | 464.4 | 0.8 | 95.0 | 99.0 | 95.0 | 3107.90 |
| 4 | Algeria | 2015 | 76.090 | 25.5 | 33.9 | NaN | 0.55 | 200.0 | 99.0 | 96 | ... | 6.978492 | NaN | 95.0 | 2.7 | 460.7 | 0.7 | 95.0 | 99.0 | 95.0 | 3208.10 |
5 rows × 30 columns
# Print the last 6 rows of the raw dataset
RAW_DATASET.tail()
| Country | Year | Life expectancy | BMI | ChildMalnutrition | Cholera | Alcohol | HIV | BCG | Adult Mortality | ... | Medical Expenditure | Retirement Age | Diphtheria | Suicides | NCD | Env Pollution | HepatitisB | Measles | Polio | Tuberculosis | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2851 | Zimbabwe | 2011 | 52.896 | 23.7 | 48.3 | 2.0 | 3.91 | 40000.0 | 98.0 | 440 | ... | 8.081738 | NaN | 93.0 | 34.3 | 842.2 | 4.6 | 94.0 | NaN | 93.0 | 1512.0 |
| 2852 | Zimbabwe | 2012 | 55.032 | 23.7 | 46.2 | 1.0 | 3.93 | 33000.0 | 98.0 | 407 | ... | 6.918353 | NaN | 95.0 | 33.1 | 826.4 | 4.5 | 97.0 | NaN | 95.0 | 1612.0 |
| 2853 | Zimbabwe | 2013 | 56.897 | 23.7 | 44.3 | NaN | 4.11 | 28000.0 | 95.0 | 383 | ... | 7.110148 | NaN | 95.0 | 31.4 | 810.2 | 4.5 | 95.0 | NaN | 95.0 | 1209.3 |
| 2854 | Zimbabwe | 2014 | 58.410 | 23.8 | 42.8 | NaN | 4.22 | 25000.0 | 99.0 | 358 | ... | 8.133524 | NaN | 91.0 | 30.8 | 804.3 | 4.4 | 91.0 | NaN | 92.0 | 1410.0 |
| 2855 | Zimbabwe | 2015 | 59.534 | 23.8 | 41.7 | 0.0 | 3.84 | 24000.0 | 90.0 | 346 | ... | 7.452066 | NaN | 87.0 | 30.7 | 800.1 | 4.3 | 87.0 | NaN | 88.0 | 1107.9 |
5 rows × 30 columns
# Generate descriptive statistics on raw dataset
RAW_DATASET.describe()
| Year | Life expectancy | BMI | ChildMalnutrition | Cholera | Alcohol | HIV | BCG | Adult Mortality | ChildMortality | ... | Medical Expenditure | Retirement Age | Diphtheria | Suicides | NCD | Env Pollution | HepatitisB | Measles | Polio | Tuberculosis | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 2856.000000 | 2567.000000 | 2856.000000 | 2822.000000 | 781.000000 | 2800.000000 | 2040.000000 | 2344.000000 | 2856.000000 | 2.856000e+03 | ... | 2458.000000 | 782.000000 | 2837.000000 | 2856.000000 | 2856.000000 | 2856.000000 | 2288.000000 | 1629.000000 | 2820.000000 | 2799.000000 |
| mean | 2008.000000 | 68.795037 | 24.562185 | 36.663466 | 83.691421 | 4.814093 | 8918.289216 | 89.593003 | 194.384804 | 6.725880e+04 | ... | 6.079373 | 66.125614 | 86.272823 | 11.992017 | 605.322024 | 1.498704 | 85.366696 | 84.666053 | 86.427660 | 8995.936395 |
| std | 4.899837 | 9.863356 | 3.390756 | 21.199494 | 301.232338 | 3.939412 | 24522.502650 | 13.897708 | 116.830456 | 2.234386e+05 | ... | 2.435629 | 9.043176 | 15.585910 | 9.989647 | 186.165083 | 1.588911 | 17.253674 | 18.385550 | 15.209758 | 43737.466927 |
| min | 2000.000000 | 39.441000 | 0.000000 | 5.800000 | 0.000000 | 0.000000 | 100.000000 | 16.000000 | 49.000000 | 1.800000e+01 | ... | 1.025159 | 57.035300 | 19.000000 | 0.100000 | 240.400000 | 0.000000 | 2.000000 | 2.000000 | 8.000000 | 0.000000 |
| 25% | 2004.000000 | 61.433500 | 23.000000 | 19.500000 | 0.000000 | 1.240000 | 200.000000 | 87.000000 | 104.000000 | 1.062500e+03 | ... | 4.152656 | 61.876000 | 82.000000 | 5.800000 | 456.575000 | 0.300000 | 81.000000 | 81.000000 | 82.000000 | 67.050000 |
| 50% | 2008.000000 | 71.710000 | 25.400000 | 30.400000 | 4.000000 | 4.015000 | 975.000000 | 95.000000 | 164.000000 | 8.016500e+03 | ... | 5.835874 | 64.249950 | 93.000000 | 10.000000 | 604.500000 | 0.800000 | 92.000000 | 92.000000 | 93.000000 | 646.000000 |
| 75% | 2012.000000 | 76.129000 | 26.400000 | 52.475000 | 38.000000 | 7.752500 | 5600.000000 | 98.000000 | 260.000000 | 5.138925e+04 | ... | 7.939192 | 67.866425 | 97.000000 | 15.000000 | 725.600000 | 2.400000 | 96.000000 | 97.000000 | 97.000000 | 3107.800000 |
| max | 2016.000000 | 84.090000 | 29.600000 | 88.400000 | 3990.000000 | 17.870000 | 290000.000000 | 99.000000 | 697.000000 | 2.025425e+06 | ... | 20.413412 | 126.126400 | 99.000000 | 116.200000 | 1317.700000 | 9.400000 | 99.000000 | 99.000000 | 99.000000 | 615058.000000 |
8 rows × 29 columns
# Print the data types of the columns in raw dataset
RAW_DATASET.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 2856 entries, 0 to 2855 Data columns (total 30 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Country 2856 non-null object 1 Year 2856 non-null int64 2 Life expectancy 2567 non-null float64 3 BMI 2856 non-null float64 4 ChildMalnutrition 2822 non-null float64 5 Cholera 781 non-null float64 6 Alcohol 2800 non-null float64 7 HIV 2040 non-null float64 8 BCG 2344 non-null float64 9 Adult Mortality 2856 non-null int64 10 ChildMortality 2856 non-null int64 11 Population 2528 non-null float64 12 Eggs Consumption 2339 non-null float64 13 Bovine Meat 2339 non-null float64 14 Mutton & Goat meat 2339 non-null float64 15 Other Meat 2339 non-null float64 16 Pig Meat 2273 non-null float64 17 Poultry Meat 2339 non-null float64 18 Milk Consumption 2339 non-null float64 19 Fish and Seafood 2339 non-null float64 20 Medical Expenditure 2458 non-null float64 21 Retirement Age 782 non-null float64 22 Diphtheria 2837 non-null float64 23 Suicides 2856 non-null float64 24 NCD 2856 non-null float64 25 Env Pollution 2856 non-null float64 26 HepatitisB 2288 non-null float64 27 Measles 1629 non-null float64 28 Polio 2820 non-null float64 29 Tuberculosis 2799 non-null float64 dtypes: float64(26), int64(3), object(1) memory usage: 669.5+ KB
# Creatine new dataframe called "dataset" to process and store the raw data
dataset = RAW_DATASET
# The dataset consists of “Year” column which is time series data.
# TODO : Change data type of "Year" column to datetime
dataset['Year'] = pd.to_datetime(dataset['Year'] , format='%Y', errors='ignore')
dataset.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 2856 entries, 0 to 2855 Data columns (total 30 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Country 2856 non-null object 1 Year 2856 non-null datetime64[ns] 2 Life expectancy 2567 non-null float64 3 BMI 2856 non-null float64 4 ChildMalnutrition 2822 non-null float64 5 Cholera 781 non-null float64 6 Alcohol 2800 non-null float64 7 HIV 2040 non-null float64 8 BCG 2344 non-null float64 9 Adult Mortality 2856 non-null int64 10 ChildMortality 2856 non-null int64 11 Population 2528 non-null float64 12 Eggs Consumption 2339 non-null float64 13 Bovine Meat 2339 non-null float64 14 Mutton & Goat meat 2339 non-null float64 15 Other Meat 2339 non-null float64 16 Pig Meat 2273 non-null float64 17 Poultry Meat 2339 non-null float64 18 Milk Consumption 2339 non-null float64 19 Fish and Seafood 2339 non-null float64 20 Medical Expenditure 2458 non-null float64 21 Retirement Age 782 non-null float64 22 Diphtheria 2837 non-null float64 23 Suicides 2856 non-null float64 24 NCD 2856 non-null float64 25 Env Pollution 2856 non-null float64 26 HepatitisB 2288 non-null float64 27 Measles 1629 non-null float64 28 Polio 2820 non-null float64 29 Tuberculosis 2799 non-null float64 dtypes: datetime64[ns](1), float64(26), int64(2), object(1) memory usage: 669.5+ KB
# Print the first 6 rows of the dataset
dataset.head()
| Country | Year | Life expectancy | BMI | ChildMalnutrition | Cholera | Alcohol | HIV | BCG | Adult Mortality | ... | Medical Expenditure | Retirement Age | Diphtheria | Suicides | NCD | Env Pollution | HepatitisB | Measles | Polio | Tuberculosis | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Albania | 2011-01-01 | 76.914 | 26.2 | 22.9 | NaN | 5.03 | 100.0 | 97.0 | 103 | ... | 4.795327 | NaN | 99.0 | 7.6 | 490.2 | 0.5 | 99.0 | 99.0 | 99.0 | 9.29 |
| 1 | Albania | 2012-01-01 | 77.252 | 26.3 | 23.1 | NaN | 4.43 | 100.0 | 96.0 | 103 | ... | 5.055262 | NaN | 99.0 | 4.8 | 507.1 | 0.4 | 99.0 | 99.0 | 99.0 | 9.29 |
| 2 | Albania | 2013-01-01 | 77.554 | 26.4 | 23.6 | NaN | 4.28 | 100.0 | 99.0 | 100 | ... | 5.385599 | NaN | 99.0 | 4.8 | 486.4 | 0.4 | 99.0 | 99.0 | 99.0 | 8.29 |
| 3 | Algeria | 2014-01-01 | 75.878 | 25.4 | 33.9 | NaN | 0.54 | 200.0 | 99.0 | 98 | ... | 6.547214 | NaN | 95.0 | 2.8 | 464.4 | 0.8 | 95.0 | 99.0 | 95.0 | 3107.90 |
| 4 | Algeria | 2015-01-01 | 76.090 | 25.5 | 33.9 | NaN | 0.55 | 200.0 | 99.0 | 96 | ... | 6.978492 | NaN | 95.0 | 2.7 | 460.7 | 0.7 | 95.0 | 99.0 | 95.0 | 3208.10 |
5 rows × 30 columns
# Total columns = 30
# Total rows = 2856
# TODO : check the count of missing values by columns in descending order
dataset.isnull().sum().sort_values(ascending=False)
Cholera 2075 Retirement Age 2074 Measles 1227 HIV 816 Pig Meat 583 HepatitisB 568 Eggs Consumption 517 Milk Consumption 517 Mutton & Goat meat 517 Bovine Meat 517 Other Meat 517 Fish and Seafood 517 Poultry Meat 517 BCG 512 Medical Expenditure 398 Population 328 Life expectancy 289 Tuberculosis 57 Alcohol 56 Polio 36 ChildMalnutrition 34 Diphtheria 19 Year 0 ChildMortality 0 Adult Mortality 0 Suicides 0 NCD 0 Env Pollution 0 BMI 0 Country 0 dtype: int64
# TODO : Drop the columns with more than 50% of missing data
# Cholera 2075
# Retirement Age 2074
# Measles 1227
# Drop the cholera column more than 50% missing data
dataset = dataset.drop(['Cholera'],axis=1)
# Drop the retitrement column
dataset = dataset.drop(['Retirement Age'],axis=1)
# Drop the Measles column
dataset = dataset.drop(['Measles'],axis=1)
# Life Expectancy is the main focus of our research
# Imputing introduces bias which is not desirable . Hence, we delete this null rows
# Life expectancy 289
# Filter the rows with null values "Life expectancy" column of dataset
dataset=dataset[dataset['Life expectancy'].notna()]
# Convert datatype of life expectancy to int after removing null values
# To apply classification algorithms
dataset['Life expectancy'] = dataset['Life expectancy'].astype(np.int64)
# TODO : check the count of missing values by columns in descending order after removing some columns and rows
dataset.isnull().sum().sort_values(ascending=False)
HIV 731 HepatitisB 522 BCG 476 Pig Meat 294 Mutton & Goat meat 228 Other Meat 228 Fish and Seafood 228 Milk Consumption 228 Poultry Meat 228 Eggs Consumption 228 Bovine Meat 228 Medical Expenditure 187 Population 124 Alcohol 56 Tuberculosis 38 ChildMalnutrition 34 Polio 34 Diphtheria 17 Year 0 ChildMortality 0 Adult Mortality 0 BMI 0 Suicides 0 NCD 0 Env Pollution 0 Life expectancy 0 Country 0 dtype: int64
# TODO : Drop only the rows with missing values which are less than 10%
# Pig Meat 294
# Mutton & Goat meat 228
# Other Meat 228
# Fish and Seafood 228
# Milk Consumption 228
# Poultry Meat 228
# Eggs Consumption 228
# Bovine Meat 228
# Medical Expenditure 187
# Population 124
# Alcohol 56
# Tuberculosis 38
# ChildMalnutrition 34
# Polio 34
# Diphtheria 17
dataset=dataset[dataset['Pig Meat'].notna()]
dataset=dataset[dataset['Mutton & Goat meat'].notna()]
dataset=dataset[dataset['Other Meat'].notna()]
dataset=dataset[dataset['Fish and Seafood'].notna()]
dataset=dataset[dataset['Milk Consumption'].notna()]
dataset=dataset[dataset['Poultry Meat'].notna()]
dataset=dataset[dataset['Eggs Consumption'].notna()]
dataset=dataset[dataset['Bovine Meat'].notna()]
dataset=dataset[dataset['Medical Expenditure'].notna()]
dataset=dataset[dataset['Population'].notna()]
dataset=dataset[dataset['Alcohol'].notna()]
dataset=dataset[dataset['Tuberculosis'].notna()]
dataset=dataset[dataset['ChildMalnutrition'].notna()]
dataset=dataset[dataset['Polio'].notna()]
dataset=dataset[dataset['Diphtheria'].notna()]
# TODO : Replace the missing values with mean value
# missing data is about 25% and deletion causes too much data loss from the dataset ( rows decrease drastically)
# HIV 731
# HepatitisB 522
# BCG 476
dataset['HIV']=dataset['HIV'].fillna(value=dataset['HIV'].mean())
dataset['BCG']=dataset['BCG'].fillna(value=dataset['BCG'].mean())
dataset['HepatitisB']=dataset['HepatitisB'].fillna(value=dataset['HepatitisB'].mean())
# Ignore and supress the warnings thrown by dataprep library
import warnings
warnings.filterwarnings('ignore')
# Import the dataprep library
import dataprep
from dataprep.eda import create_report
# Generate the visual report called "pre_processed_report" using create_report method
pre_processed_report = create_report(dataset, title='Pre-Processed Dataset')
pre_processed_report
NumExpr defaulting to 8 threads.
| Number of Variables | 27 |
|---|---|
| Number of Rows | 2074 |
| Missing Cells | 0 |
| Missing Cells (%) | 0.0% |
| Duplicate Rows | 0 |
| Duplicate Rows (%) | 0.0% |
| Total Size in Memory | 568.7 KB |
| Average Row Size in Memory | 280.8 B |
| Categorical | 1 |
|---|---|
| DateTime | 1 |
| Numerical | 25 |
categorical
| Distinct Count | 125 |
|---|---|
| Unique (%) | 6.0% |
| Missing | 0 |
| Missing (%) | 0.0% |
| Memory Size | 147.5 KB |
| Mean | 7.8038 |
|---|---|
| Standard Deviation | 3.1386 |
| Median | 7 |
| Minimum | 4 |
| Maximum | 24 |
| 1st row | Albania |
|---|---|
| 2nd row | Albania |
| 3rd row | Albania |
| 4th row | Algeria |
| 5th row | Algeria |
| Count | 15929 |
|---|---|
| Lowercase Letter | 13633 |
| Space Separator | 239 |
| Uppercase Letter | 2296 |
| Dash Punctuation | 17 |
| Decimal Number | 0 |
datetime
| Distinct Count | 17 |
|---|---|
| Unique (%) | 0.8% |
| Missing | 0 |
| Missing (%) | 0.0% |
| Memory Size | 32.4 KB |
| Minimum | 2000-01-01 00:00:00 |
| Maximum | 2016-01-01 00:00:00 |
numerical
| Distinct Count | 46 |
|---|---|
| Unique (%) | 2.2% |
| Missing | 0 |
| Missing (%) | 0.0% |
| Infinite | 0 |
| Infinite (%) | 0.0% |
| Memory Size | 32.4 KB |
| Mean | 68.7083 |
| Minimum | 39 |
| Maximum | 84 |
| Zeros | 0 |
| Zeros (%) | 0.0% |
| Negatives | 0 |
| Negatives (%) | 0.0% |
| Minimum | 39 |
|---|---|
| 5-th Percentile | 49.65 |
| Q1 | 62 |
| Median | 71 |
| Q3 | 76 |
| 95-th Percentile | 81 |
| Maximum | 84 |
| Range | 45 |
| IQR | 14 |
| Mean | 68.7083 |
|---|---|
| Standard Deviation | 10.0026 |
| Variance | 100.0523 |
| Sum | 142501 |
| Skewness | -0.7547 |
| Kurtosis | -0.3761 |
| Coefficient of Variation | 0.1456 |
numerical
| Distinct Count | 94 |
|---|---|
| Unique (%) | 4.5% |
| Missing | 0 |
| Missing (%) | 0.0% |
| Infinite | 0 |
| Infinite (%) | 0.0% |
| Memory Size | 32.4 KB |
| Mean | 24.8506 |
| Minimum | 19.8 |
| Maximum | 29.1 |
| Zeros | 0 |
| Zeros (%) | 0.0% |
| Negatives | 0 |
| Negatives (%) | 0.0% |
| Minimum | 19.8 |
|---|---|
| 5-th Percentile | 21.4 |
| Q1 | 23.2 |
| Median | 25.5 |
| Q3 | 26.3 |
| 95-th Percentile | 27.3 |
| Maximum | 29.1 |
| Range | 9.3 |
| IQR | 3.1 |
| Mean | 24.8506 |
|---|---|
| Standard Deviation | 1.9323 |
| Variance | 3.7337 |
| Sum | 51540.2 |
| Skewness | -0.5303 |
| Kurtosis | -0.7565 |
| Coefficient of Variation | 0.07776 |
numerical
| Distinct Count | 652 |
|---|---|
| Unique (%) | 31.4% |
| Missing | 0 |
| Missing (%) | 0.0% |
| Infinite | 0 |
| Infinite (%) | 0.0% |
| Memory Size | 32.4 KB |
| Mean | 35.8725 |
| Minimum | 9.3 |
| Maximum | 88.4 |
| Zeros | 0 |
| Zeros (%) | 0.0% |
| Negatives | 0 |
| Negatives (%) | 0.0% |
| Minimum | 9.3 |
|---|---|
| 5-th Percentile | 11.7 |
| Q1 | 18.8 |
| Median | 29.25 |
| Q3 | 50.5 |
| 95-th Percentile | 77.7 |
| Maximum | 88.4 |
| Range | 79.1 |
| IQR | 31.7 |
| Mean | 35.8725 |
|---|---|
| Standard Deviation | 21.6236 |
| Variance | 467.5794 |
| Sum | 74399.5 |
| Skewness | 0.7696 |
| Kurtosis | -0.6317 |
| Coefficient of Variation | 0.6028 |
numerical
| Distinct Count | 1006 |
|---|---|
| Unique (%) | 48.5% |
| Missing | 0 |
| Missing (%) | 0.0% |
| Infinite | 0 |
| Infinite (%) | 0.0% |
| Memory Size | 32.4 KB |
| Mean | 5.2056 |
| Minimum | 0 |
| Maximum | 17.87 |
| Zeros | 25 |
| Zeros (%) | 1.2% |
| Negatives | 0 |
| Negatives (%) | 0.0% |
| Minimum | 0 |
|---|---|
| 5-th Percentile | 0.19 |
| Q1 | 1.68 |
| Median | 4.485 |
| Q3 | 8.1 |
| 95-th Percentile | 12.1 |
| Maximum | 17.87 |
| Range | 17.87 |
| IQR | 6.42 |
| Mean | 5.2056 |
|---|---|
| Standard Deviation | 3.9504 |
| Variance | 15.6055 |
| Sum | 10796.33 |
| Skewness | 0.5139 |
| Kurtosis | -0.6963 |
| Coefficient of Variation | 0.7589 |
numerical
| Distinct Count | 219 |
|---|---|
| Unique (%) | 10.6% |
| Missing | 0 |
| Missing (%) | 0.0% |
| Infinite | 0 |
| Infinite (%) | 0.0% |
| Memory Size | 32.4 KB |
| Mean | 9416.6428 |
| Minimum | 100 |
| Maximum | 290000 |
| Zeros | 0 |
| Zeros (%) | 0.0% |
| Negatives | 0 |
| Negatives (%) | 0.0% |
| Minimum | 100 |
|---|---|
| 5-th Percentile | 100 |
| Q1 | 500 |
| Median | 3800 |
| Q3 | 9416.6428 |
| 95-th Percentile | 42350 |
| Maximum | 290000 |
| Range | 289900 |
| IQR | 8916.6428 |
| Mean | 9416.6428 |
|---|---|
| Standard Deviation | 22287.9745 |
| Variance | 4.9675e+08 |
| Sum | 1.953e+07 |
| Skewness | 6.8164 |
| Kurtosis | 62.0487 |
| Coefficient of Variation | 2.3669 |
numerical
| Distinct Count | 76 |
|---|---|
| Unique (%) | 3.7% |
| Missing | 0 |
| Missing (%) | 0.0% |
| Infinite | 0 |
| Infinite (%) | 0.0% |
| Memory Size | 32.4 KB |
| Mean | 89.9038 |
| Minimum | 16 |
| Maximum | 99 |
| Zeros | 0 |
| Zeros (%) | 0.0% |
| Negatives | 0 |
| Negatives (%) | 0.0% |
| Minimum | 16 |
|---|---|
| 5-th Percentile | 69 |
| Q1 | 89.9038 |
| Median | 92 |
| Q3 | 98 |
| 95-th Percentile | 99 |
| Maximum | 99 |
| Range | 83 |
| IQR | 8.0962 |
| Mean | 89.9038 |
|---|---|
| Standard Deviation | 12.4791 |
| Variance | 155.7282 |
| Sum | 186460.5769 |
| Skewness | -3.0251 |
| Kurtosis | 11.3404 |
| Coefficient of Variation | 0.1388 |
numerical
| Distinct Count | 445 |
|---|---|
| Unique (%) | 21.5% |
| Missing | 0 |
| Missing (%) | 0.0% |
| Infinite | 0 |
| Infinite (%) | 0.0% |
| Memory Size | 32.4 KB |
| Mean | 193.2985 |
| Minimum | 49 |
| Maximum | 683 |
| Zeros | 0 |
| Zeros (%) | 0.0% |
| Negatives | 0 |
| Negatives (%) | 0.0% |
| Minimum | 49 |
|---|---|
| 5-th Percentile | 62.65 |
| Q1 | 101 |
| Median | 161 |
| Q3 | 255 |
| 95-th Percentile | 445.35 |
| Maximum | 683 |
| Range | 634 |
| IQR | 154 |
| Mean | 193.2985 |
|---|---|
| Standard Deviation | 119.867 |
| Variance | 14368.0976 |
| Sum | 400901 |
| Skewness | 1.2728 |
| Kurtosis | 1.4242 |
| Coefficient of Variation | 0.6201 |
numerical
| Distinct Count | 124 |
|---|---|
| Unique (%) | 6.0% |
| Missing | 0 |
| Missing (%) | 0.0% |
| Infinite | 0 |
| Infinite (%) | 0.0% |
| Memory Size | 32.4 KB |
| Mean | 75008.4851 |
| Minimum | 18 |
| Maximum | 2.0254e+06 |
| Zeros | 0 |
| Zeros (%) | 0.0% |
| Negatives | 0 |
| Negatives (%) | 0.0% |
| Minimum | 18 |
|---|---|
| 5-th Percentile | 101 |
| Q1 | 979 |
| Median | 6704 |
| Q3 | 52218 |
| 95-th Percentile | 219826 |
| Maximum | 2.0254e+06 |
| Range | 2.0254e+06 |
| IQR | 51239 |
| Mean | 75008.4851 |
|---|---|
| Standard Deviation | 254538.4035 |
| Variance | 6.479e+10 |
| Sum | 1.5557e+08 |
| Skewness | 6.1166 |
| Kurtosis | 39.4687 |
| Coefficient of Variation | 3.3935 |
numerical
| Distinct Count | 2073 |
|---|---|
| Unique (%) | 100.0% |
| Missing | 0 |
| Missing (%) | 0.0% |
| Infinite | 0 |
| Infinite (%) | 0.0% |
| Memory Size | 32.4 KB |
| Mean | 4.3879e+07 |
| Minimum | 247315 |
| Maximum | 1.3787e+09 |
| Zeros | 0 |
| Zeros (%) | 0.0% |
| Negatives | 0 |
| Negatives (%) | 0.0% |
| Minimum | 247315 |
|---|---|
| 5-th Percentile | 515042.65 |
| Q1 | 3.5037e+06 |
| Median | 9.4696e+06 |
| Q3 | 2.4687e+07 |
| 95-th Percentile | 1.4258e+08 |
| Maximum | 1.3787e+09 |
| Range | 1.3784e+09 |
| IQR | 2.1183e+07 |
| Mean | 4.3879e+07 |
|---|---|
| Standard Deviation | 1.6239e+08 |
| Variance | 2.6371e+16 |
| Sum | 9.1006e+10 |
| Skewness | 7.019 |
| Kurtosis | 50.2303 |
| Coefficient of Variation | 3.7009 |
numerical
| Distinct Count | 1105 |
|---|---|
| Unique (%) | 53.3% |
| Missing | 0 |
| Missing (%) | 0.0% |
| Infinite | 0 |
| Infinite (%) | 0.0% |
| Memory Size | 32.4 KB |
| Mean | 6.6446 |
| Minimum | 0.01 |
| Maximum | 22.35 |
| Zeros | 0 |
| Zeros (%) | 0.0% |
| Negatives | 0 |
| Negatives (%) | 0.0% |
| Minimum | 0.01 |
|---|---|
| 5-th Percentile | 0.46 |
| Q1 | 1.75 |
| Median | 6.13 |
| Q3 | 10.18 |
| 95-th Percentile | 15.721 |
| Maximum | 22.35 |
| Range | 22.34 |
| IQR | 8.43 |
| Mean | 6.6446 |
|---|---|
| Standard Deviation | 5.0699 |
| Variance | 25.7043 |
| Sum | 13780.87 |
| Skewness | 0.5265 |
| Kurtosis | -0.6652 |
| Coefficient of Variation | 0.763 |
numerical
| Distinct Count | 1381 |
|---|---|
| Unique (%) | 66.6% |
| Missing | 0 |
| Missing (%) | 0.0% |
| Infinite | 0 |
| Infinite (%) | 0.0% |
| Memory Size | 32.4 KB |
| Mean | 11.6437 |
| Minimum | 0.26 |
| Maximum | 59.09 |
| Zeros | 0 |
| Zeros (%) | 0.0% |
| Negatives | 0 |
| Negatives (%) | 0.0% |
| Minimum | 0.26 |
|---|---|
| 5-th Percentile | 1.5795 |
| Q1 | 4.99 |
| Median | 8.325 |
| Q3 | 17.35 |
| 95-th Percentile | 28.1235 |
| Maximum | 59.09 |
| Range | 58.83 |
| IQR | 12.36 |
| Mean | 11.6437 |
|---|---|
| Standard Deviation | 9.2928 |
| Variance | 86.3565 |
| Sum | 24149.09 |
| Skewness | 1.5618 |
| Kurtosis | 3.4582 |
| Coefficient of Variation | 0.7981 |
numerical
| Distinct Count | 726 |
|---|---|
| Unique (%) | 35.0% |
| Missing | 0 |
| Missing (%) | 0.0% |
| Infinite | 0 |
| Infinite (%) | 0.0% |
| Memory Size | 32.4 KB |
| Mean | 3.3526 |
| Minimum | 0 |
| Maximum | 55.4 |
| Zeros | 20 |
| Zeros (%) | 1.0% |
| Negatives | 0 |
| Negatives (%) | 0.0% |
| Minimum | 0 |
|---|---|
| 5-th Percentile | 0.04 |
| Q1 | 0.6 |
| Median | 1.445 |
| Q3 | 3.49 |
| 95-th Percentile | 13.027 |
| Maximum | 55.4 |
| Range | 55.4 |
| IQR | 2.89 |
| Mean | 3.3526 |
|---|---|
| Standard Deviation | 5.7487 |
| Variance | 33.0476 |
| Sum | 6953.33 |
| Skewness | 4.3163 |
| Kurtosis | 24.5375 |
| Coefficient of Variation | 1.7147 |
numerical
| Distinct Count | 501 |
|---|---|
| Unique (%) | 24.2% |
| Missing | 0 |
| Missing (%) | 0.0% |
| Infinite | 0 |
| Infinite (%) | 0.0% |
| Memory Size | 32.4 KB |
| Mean | 1.584 |
| Minimum | 0 |
| Maximum | 22.31 |
| Zeros | 199 |
| Zeros (%) | 9.6% |
| Negatives | 0 |
| Negatives (%) | 0.0% |
| Minimum | 0 |
|---|---|
| 5-th Percentile | 0 |
| Q1 | 0.1 |
| Median | 0.71 |
| Q3 | 1.9 |
| 95-th Percentile | 6.1405 |
| Maximum | 22.31 |
| Range | 22.31 |
| IQR | 1.8 |
| Mean | 1.584 |
|---|---|
| Standard Deviation | 2.6958 |
| Variance | 7.2676 |
| Sum | 3285.3 |
| Skewness | 3.6449 |
| Kurtosis | 16.6287 |
| Coefficient of Variation | 1.7019 |
numerical
| Distinct Count | 1307 |
|---|---|
| Unique (%) | 63.0% |
| Missing | 0 |
| Missing (%) | 0.0% |
| Infinite | 0 |
| Infinite (%) | 0.0% |
| Memory Size | 32.4 KB |
| Mean | 12.8389 |
| Minimum | 0 |
| Maximum | 64.24 |
| Zeros | 72 |
| Zeros (%) | 3.5% |
| Negatives | 0 |
| Negatives (%) | 0.0% |
| Minimum | 0 |
|---|---|
| 5-th Percentile | 0.01 |
| Q1 | 1.145 |
| Median | 5.61 |
| Q3 | 22.34 |
| 95-th Percentile | 42.522 |
| Maximum | 64.24 |
| Range | 64.24 |
| IQR | 21.195 |
| Mean | 12.8389 |
|---|---|
| Standard Deviation | 14.9566 |
| Variance | 223.7009 |
| Sum | 26627.93 |
| Skewness | 1.1647 |
| Kurtosis | 0.2275 |
| Coefficient of Variation | 1.1649 |
numerical
| Distinct Count | 1522 |
|---|---|
| Unique (%) | 73.4% |
| Missing | 0 |
| Missing (%) | 0.0% |
| Infinite | 0 |
| Infinite (%) | 0.0% |
| Memory Size | 32.4 KB |
| Mean | 15.5803 |
| Minimum | 0.05 |
| Maximum | 72.74 |
| Zeros | 0 |
| Zeros (%) | 0.0% |
| Negatives | 0 |
| Negatives (%) | 0.0% |
| Minimum | 0.05 |
|---|---|
| 5-th Percentile | 0.8595 |
| Q1 | 4.0025 |
| Median | 13.63 |
| Q3 | 23.72 |
| 95-th Percentile | 38.1445 |
| Maximum | 72.74 |
| Range | 72.69 |
| IQR | 19.7175 |
| Mean | 15.5803 |
|---|---|
| Standard Deviation | 12.8611 |
| Variance | 165.4069 |
| Sum | 32313.6 |
| Skewness | 1.0476 |
| Kurtosis | 1.3842 |
| Coefficient of Variation | 0.8255 |
numerical
| Distinct Count | 1980 |
|---|---|
| Unique (%) | 95.5% |
| Missing | 0 |
| Missing (%) | 0.0% |
| Infinite | 0 |
| Infinite (%) | 0.0% |
| Memory Size | 32.4 KB |
| Mean | 116.448 |
| Minimum | 1.02 |
| Maximum | 463.91 |
| Zeros | 0 |
| Zeros (%) | 0.0% |
| Negatives | 0 |
| Negatives (%) | 0.0% |
| Minimum | 1.02 |
|---|---|
| 5-th Percentile | 6.624 |
| Q1 | 31.9475 |
| Median | 98.2 |
| Q3 | 178.51 |
| 95-th Percentile | 291.8985 |
| Maximum | 463.91 |
| Range | 462.89 |
| IQR | 146.5625 |
| Mean | 116.448 |
|---|---|
| Standard Deviation | 94.9286 |
| Variance | 9011.434 |
| Sum | 241513.11 |
| Skewness | 0.7646 |
| Kurtosis | -0.237 |
| Coefficient of Variation | 0.8152 |
numerical
| Distinct Count | 1577 |
|---|---|
| Unique (%) | 76.0% |
| Missing | 0 |
| Missing (%) | 0.0% |
| Infinite | 0 |
| Infinite (%) | 0.0% |
| Memory Size | 32.4 KB |
| Mean | 17.395 |
| Minimum | 0.07 |
| Maximum | 191.75 |
| Zeros | 0 |
| Zeros (%) | 0.0% |
| Negatives | 0 |
| Negatives (%) | 0.0% |
| Minimum | 0.07 |
|---|---|
| 5-th Percentile | 1.3165 |
| Q1 | 5.1825 |
| Median | 12.16 |
| Q3 | 24 |
| 95-th Percentile | 47.607 |
| Maximum | 191.75 |
| Range | 191.68 |
| IQR | 18.8175 |
| Mean | 17.395 |
|---|---|
| Standard Deviation | 19.817 |
| Variance | 392.7147 |
| Sum | 36077.21 |
| Skewness | 3.9626 |
| Kurtosis | 25.1503 |
| Coefficient of Variation | 1.1392 |
numerical
| Distinct Count | 2074 |
|---|---|
| Unique (%) | 100.0% |
| Missing | 0 |
| Missing (%) | 0.0% |
| Infinite | 0 |
| Infinite (%) | 0.0% |
| Memory Size | 32.4 KB |
| Mean | 6.3738 |
| Minimum | 1.7014 |
| Maximum | 20.4134 |
| Zeros | 0 |
| Zeros (%) | 0.0% |
| Negatives | 0 |
| Negatives (%) | 0.0% |
| Minimum | 1.7014 |
|---|---|
| 5-th Percentile | 2.8371 |
| Q1 | 4.4335 |
| Median | 6.1969 |
| Q3 | 8.1496 |
| 95-th Percentile | 10.3428 |
| Maximum | 20.4134 |
| Range | 18.712 |
| IQR | 3.7161 |
| Mean | 6.3738 |
|---|---|
| Standard Deviation | 2.3791 |
| Variance | 5.6603 |
| Sum | 13219.2983 |
| Skewness | 0.3744 |
| Kurtosis | 0.2049 |
| Coefficient of Variation | 0.3733 |
numerical
| Distinct Count | 78 |
|---|---|
| Unique (%) | 3.8% |
| Missing | 0 |
| Missing (%) | 0.0% |
| Infinite | 0 |
| Infinite (%) | 0.0% |
| Memory Size | 32.4 KB |
| Mean | 86.6432 |
| Minimum | 19 |
| Maximum | 99 |
| Zeros | 0 |
| Zeros (%) | 0.0% |
| Negatives | 0 |
| Negatives (%) | 0.0% |
| Minimum | 19 |
|---|---|
| 5-th Percentile | 51 |
| Q1 | 83 |
| Median | 92 |
| Q3 | 97 |
| 95-th Percentile | 99 |
| Maximum | 99 |
| Range | 80 |
| IQR | 14 |
| Mean | 86.6432 |
|---|---|
| Standard Deviation | 14.9612 |
| Variance | 223.8369 |
| Sum | 179698 |
| Skewness | -1.9149 |
| Kurtosis | 3.5448 |
| Coefficient of Variation | 0.1727 |
numerical
| Distinct Count | 362 |
|---|---|
| Unique (%) | 17.4% |
| Missing | 0 |
| Missing (%) | 0.0% |
| Infinite | 0 |
| Infinite (%) | 0.0% |
| Memory Size | 32.4 KB |
| Mean | 12.6951 |
| Minimum | 0.1 |
| Maximum | 116.2 |
| Zeros | 0 |
| Zeros (%) | 0.0% |
| Negatives | 0 |
| Negatives (%) | 0.0% |
| Minimum | 0.1 |
|---|---|
| 5-th Percentile | 2.9 |
| Q1 | 6 |
| Median | 10.4 |
| Q3 | 15.4 |
| 95-th Percentile | 30.27 |
| Maximum | 116.2 |
| Range | 116.1 |
| IQR | 9.4 |
| Mean | 12.6951 |
|---|---|
| Standard Deviation | 10.8287 |
| Variance | 117.2611 |
| Sum | 26329.6 |
| Skewness | 3.6664 |
| Kurtosis | 22.6907 |
| Coefficient of Variation | 0.853 |
numerical
| Distinct Count | 1796 |
|---|---|
| Unique (%) | 86.6% |
| Missing | 0 |
| Missing (%) | 0.0% |
| Infinite | 0 |
| Infinite (%) | 0.0% |
| Memory Size | 32.4 KB |
| Mean | 595.0553 |
| Minimum | 240.4 |
| Maximum | 1317.7 |
| Zeros | 0 |
| Zeros (%) | 0.0% |
| Negatives | 0 |
| Negatives (%) | 0.0% |
| Minimum | 240.4 |
|---|---|
| 5-th Percentile | 328.125 |
| Q1 | 437.325 |
| Median | 588.35 |
| Q3 | 710.125 |
| 95-th Percentile | 935.445 |
| Maximum | 1317.7 |
| Range | 1077.3 |
| IQR | 272.8 |
| Mean | 595.0553 |
|---|---|
| Standard Deviation | 194.1573 |
| Variance | 37697.0431 |
| Sum | 1.2341e+06 |
| Skewness | 0.644 |
| Kurtosis | 0.357 |
| Coefficient of Variation | 0.3263 |
numerical
| Distinct Count | 83 |
|---|---|
| Unique (%) | 4.0% |
| Missing | 0 |
| Missing (%) | 0.0% |
| Infinite | 0 |
| Infinite (%) | 0.0% |
| Memory Size | 32.4 KB |
| Mean | 1.4701 |
| Minimum | 0 |
| Maximum | 9.2 |
| Zeros | 31 |
| Zeros (%) | 1.5% |
| Negatives | 0 |
| Negatives (%) | 0.0% |
| Minimum | 0 |
|---|---|
| 5-th Percentile | 0.1 |
| Q1 | 0.3 |
| Median | 0.7 |
| Q3 | 2.5 |
| 95-th Percentile | 4.4 |
| Maximum | 9.2 |
| Range | 9.2 |
| IQR | 2.2 |
| Mean | 1.4701 |
|---|---|
| Standard Deviation | 1.5467 |
| Variance | 2.3922 |
| Sum | 3048.9 |
| Skewness | 1.469 |
| Kurtosis | 2.2657 |
| Coefficient of Variation | 1.0521 |
numerical
| Distinct Count | 90 |
|---|---|
| Unique (%) | 4.3% |
| Missing | 0 |
| Missing (%) | 0.0% |
| Infinite | 0 |
| Infinite (%) | 0.0% |
| Memory Size | 32.4 KB |
| Mean | 84.7931 |
| Minimum | 2 |
| Maximum | 99 |
| Zeros | 0 |
| Zeros (%) | 0.0% |
| Negatives | 0 |
| Negatives (%) | 0.0% |
| Minimum | 2 |
|---|---|
| 5-th Percentile | 48 |
| Q1 | 84 |
| Median | 88 |
| Q3 | 95 |
| 95-th Percentile | 99 |
| Maximum | 99 |
| Range | 97 |
| IQR | 11 |
| Mean | 84.7931 |
|---|---|
| Standard Deviation | 15.734 |
| Variance | 247.5587 |
| Sum | 175860.8966 |
| Skewness | -2.3798 |
| Kurtosis | 6.6278 |
| Coefficient of Variation | 0.1856 |
numerical
| Distinct Count | 74 |
|---|---|
| Unique (%) | 3.6% |
| Missing | 0 |
| Missing (%) | 0.0% |
| Infinite | 0 |
| Infinite (%) | 0.0% |
| Memory Size | 32.4 KB |
| Mean | 86.7208 |
| Minimum | 8 |
| Maximum | 99 |
| Zeros | 0 |
| Zeros (%) | 0.0% |
| Negatives | 0 |
| Negatives (%) | 0.0% |
| Minimum | 8 |
|---|---|
| 5-th Percentile | 53 |
| Q1 | 82 |
| Median | 93 |
| Q3 | 97 |
| 95-th Percentile | 99 |
| Maximum | 99 |
| Range | 91 |
| IQR | 15 |
| Mean | 86.7208 |
|---|---|
| Standard Deviation | 14.6434 |
| Variance | 214.43 |
| Sum | 179859 |
| Skewness | -1.8239 |
| Kurtosis | 3.3041 |
| Coefficient of Variation | 0.1689 |
numerical
| Distinct Count | 1766 |
|---|---|
| Unique (%) | 85.2% |
| Missing | 0 |
| Missing (%) | 0.0% |
| Infinite | 0 |
| Infinite (%) | 0.0% |
| Memory Size | 32.4 KB |
| Mean | 10668.9849 |
| Minimum | 0 |
| Maximum | 615058 |
| Zeros | 1 |
| Zeros (%) | 0.0% |
| Negatives | 0 |
| Negatives (%) | 0.0% |
| Minimum | 0 |
|---|---|
| 5-th Percentile | 8.43 |
| Q1 | 67.0125 |
| Median | 583.6 |
| Q3 | 3110.225 |
| 95-th Percentile | 49029 |
| Maximum | 615058 |
| Range | 615058 |
| IQR | 3043.2125 |
| Mean | 10668.9849 |
|---|---|
| Standard Deviation | 50450.6869 |
| Variance | 2.5453e+09 |
| Sum | 2.2127e+07 |
| Skewness | 9.2523 |
| Kurtosis | 93.7958 |
| Coefficient of Variation | 4.7287 |
# Using label encoder for the categorical columns with text values
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
# TODO : Encode labels "Country" and "Year"
dataset['Country'] = le.fit_transform(dataset['Country'])
dataset['Year'] = le.fit_transform(dataset['Year'])
dataset.head()
| Country | Year | Life expectancy | BMI | ChildMalnutrition | Alcohol | HIV | BCG | Adult Mortality | ChildMortality | ... | Milk Consumption | Fish and Seafood | Medical Expenditure | Diphtheria | Suicides | NCD | Env Pollution | HepatitisB | Polio | Tuberculosis | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 11 | 76 | 26.2 | 22.9 | 5.03 | 100.0 | 97.0 | 103 | 868 | ... | 301.27 | 5.86 | 4.795327 | 99.0 | 7.6 | 490.2 | 0.5 | 99.0 | 99.0 | 9.29 |
| 1 | 1 | 12 | 77 | 26.3 | 23.1 | 4.43 | 100.0 | 96.0 | 103 | 868 | ... | 299.85 | 4.97 | 5.055262 | 99.0 | 4.8 | 507.1 | 0.4 | 99.0 | 99.0 | 9.29 |
| 2 | 1 | 13 | 77 | 26.4 | 23.6 | 4.28 | 100.0 | 99.0 | 100 | 868 | ... | 303.72 | 4.87 | 5.385599 | 99.0 | 4.8 | 486.4 | 0.4 | 99.0 | 99.0 | 8.29 |
| 3 | 2 | 14 | 75 | 25.4 | 33.9 | 0.54 | 200.0 | 99.0 | 98 | 60319 | ... | 151.06 | 4.40 | 6.547214 | 95.0 | 2.8 | 464.4 | 0.8 | 95.0 | 95.0 | 3107.90 |
| 4 | 2 | 15 | 76 | 25.5 | 33.9 | 0.55 | 200.0 | 99.0 | 96 | 60319 | ... | 125.37 | 4.16 | 6.978492 | 95.0 | 2.7 | 460.7 | 0.7 | 95.0 | 95.0 | 3208.10 |
5 rows × 27 columns
# Plot the correleation between all the features in the dataset
# Identify the strongly related related variables with Life Expectancy
#Using Pearson Correlation
plt.figure(figsize=(20,15))
correlation_matrix = dataset.corr()
sns.heatmap(correlation_matrix, annot=True, vmin=-1, vmax=1, center= 0, cmap= 'coolwarm')
plt.show()
# Correlation with output variable Life expectancy
cor_target = correlation_matrix["Life expectancy"]
# View highly correlated features
relevant_features = cor_target[cor_target>=-1]
relevant_features.sort_values(ascending=False)
Life expectancy 1.000000 Eggs Consumption 0.708943 Milk Consumption 0.659305 BMI 0.654637 Polio 0.649462 Diphtheria 0.641525 Pig Meat 0.591232 Poultry Meat 0.590431 Alcohol 0.450114 Bovine Meat 0.439087 Medical Expenditure 0.364535 Fish and Seafood 0.330970 HepatitisB 0.244216 BCG 0.235935 Year 0.183378 Mutton & Goat meat 0.056031 Population 0.010906 Country -0.067061 Other Meat -0.090164 Tuberculosis -0.106261 ChildMortality -0.250402 HIV -0.326044 Suicides -0.429654 NCD -0.706556 Env Pollution -0.708975 ChildMalnutrition -0.882725 Adult Mortality -0.948039 Name: Life expectancy, dtype: float64
# Drop features with no strong correlation with the target variable from the dataset
# HepatitisB 0.244216
# BCG 0.235935
# Year 0.183378
# Mutton & Goat meat 0.056031
# Population 0.010906
# Country -0.067061
# Other Meat -0.090164
# Tuberculosis -0.106261
dataset=dataset.drop('HepatitisB',axis=1)
dataset=dataset.drop('BCG',axis=1)
dataset=dataset.drop('Year',axis=1)
dataset=dataset.drop('Mutton & Goat meat',axis=1)
dataset=dataset.drop('Population',axis=1)
dataset=dataset.drop('Country',axis=1)
dataset=dataset.drop('Other Meat',axis=1)
dataset=dataset.drop('Tuberculosis',axis=1)
# Print first 6 rows in dataset
dataset.head()
| Life expectancy | BMI | ChildMalnutrition | Alcohol | HIV | Adult Mortality | ChildMortality | Eggs Consumption | Bovine Meat | Pig Meat | Poultry Meat | Milk Consumption | Fish and Seafood | Medical Expenditure | Diphtheria | Suicides | NCD | Env Pollution | Polio | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 76 | 26.2 | 22.9 | 5.03 | 100.0 | 103 | 868 | 7.72 | 21.24 | 11.03 | 13.41 | 301.27 | 5.86 | 4.795327 | 99.0 | 7.6 | 490.2 | 0.5 | 99.0 |
| 1 | 77 | 26.3 | 23.1 | 4.43 | 100.0 | 103 | 868 | 12.69 | 22.40 | 11.04 | 12.76 | 299.85 | 4.97 | 5.055262 | 99.0 | 4.8 | 507.1 | 0.4 | 99.0 |
| 2 | 77 | 26.4 | 23.6 | 4.28 | 100.0 | 100 | 868 | 12.45 | 22.50 | 10.88 | 13.23 | 303.72 | 4.87 | 5.385599 | 99.0 | 4.8 | 486.4 | 0.4 | 99.0 |
| 3 | 75 | 25.4 | 33.9 | 0.54 | 200.0 | 98 | 60319 | 7.93 | 5.43 | 0.00 | 6.86 | 151.06 | 4.40 | 6.547214 | 95.0 | 2.8 | 464.4 | 0.8 | 95.0 |
| 4 | 76 | 25.5 | 33.9 | 0.55 | 200.0 | 96 | 60319 | 8.65 | 5.35 | 0.00 | 6.64 | 125.37 | 4.16 | 6.978492 | 95.0 | 2.7 | 460.7 | 0.7 | 95.0 |
# TODO : Feature Scaling
#transform the data to be on same scale using sklearn's StandardScaler()
from sklearn.preprocessing import StandardScaler
scale = StandardScaler()
X = dataset.drop('Life expectancy',axis=1)
y = dataset['Life expectancy'].astype('int')
# Print the features "X" and target variable "y"
print( X , y)
BMI ChildMalnutrition Alcohol HIV Adult Mortality \
0 26.2 22.9 5.03 100.0 103
1 26.3 23.1 4.43 100.0 103
2 26.4 23.6 4.28 100.0 100
3 25.4 33.9 0.54 200.0 98
4 25.5 33.9 0.55 200.0 96
... ... ... ... ... ...
2851 23.7 48.3 3.91 40000.0 440
2852 23.7 46.2 3.93 33000.0 407
2853 23.7 44.3 4.11 28000.0 383
2854 23.8 42.8 4.22 25000.0 358
2855 23.8 41.7 3.84 24000.0 346
ChildMortality Eggs Consumption Bovine Meat Pig Meat Poultry Meat \
0 868 7.72 21.24 11.03 13.41
1 868 12.69 22.40 11.04 12.76
2 868 12.45 22.50 10.88 13.23
3 60319 7.93 5.43 0.00 6.86
4 60319 8.65 5.35 0.00 6.64
... ... ... ... ... ...
2851 52218 1.78 7.66 2.81 6.69
2852 52218 1.78 7.53 2.88 6.72
2853 52218 1.75 7.37 2.65 4.97
2854 52218 1.93 7.26 1.84 4.49
2855 52218 1.84 7.25 1.74 4.67
Milk Consumption Fish and Seafood Medical Expenditure Diphtheria \
0 301.27 5.86 4.795327 99.0
1 299.85 4.97 5.055262 99.0
2 303.72 4.87 5.385599 99.0
3 151.06 4.40 6.547214 95.0
4 125.37 4.16 6.978492 95.0
... ... ... ... ...
2851 34.21 2.99 8.081738 93.0
2852 31.07 2.91 6.918353 95.0
2853 31.90 2.82 7.110148 95.0
2854 30.54 3.39 8.133524 91.0
2855 27.38 3.82 7.452066 87.0
Suicides NCD Env Pollution Polio
0 7.6 490.2 0.5 99.0
1 4.8 507.1 0.4 99.0
2 4.8 486.4 0.4 99.0
3 2.8 464.4 0.8 95.0
4 2.7 460.7 0.7 95.0
... ... ... ... ...
2851 34.3 842.2 4.6 93.0
2852 33.1 826.4 4.5 95.0
2853 31.4 810.2 4.5 95.0
2854 30.8 804.3 4.4 92.0
2855 30.7 800.1 4.3 88.0
[2074 rows x 18 columns] 0 76
1 77
2 77
3 75
4 76
..
2851 52
2852 55
2853 56
2854 58
2855 59
Name: Life expectancy, Length: 2074, dtype: int32
X = scale.fit_transform(X)
X
array([[ 0.69850001, -0.60006687, -0.04445178, ..., -0.54018371,
-0.62734346, 0.83874704],
[ 0.75026479, -0.59081548, -0.19637236, ..., -0.45311988,
-0.69201418, 0.83874704],
[ 0.80202956, -0.567687 , -0.2343525 , ..., -0.5597602 ,
-0.69201418, 0.83874704],
...,
[-0.59561939, 0.38983192, -0.27739667, ..., 1.10836225,
1.95948557, 0.56552109],
[-0.54385461, 0.32044649, -0.24954456, ..., 1.07796719,
1.89481484, 0.36060162],
[-0.54385461, 0.26956384, -0.34576093, ..., 1.05633002,
1.83014412, 0.08737566]])
#splitting the data into my train and test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,random_state=0)
#from sklearn import utils
#print(utils.multiclass.type_of_target(y_train))
#print(utils.multiclass.type_of_target(y_train.astype('int')))
y_train = y_train.astype('int')
y_test = y_test.astype('int')
np.unique(y_train, return_counts=True)
(array([39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55,
56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72,
73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84]),
array([ 1, 1, 1, 5, 6, 7, 7, 9, 12, 13, 12, 24, 24,
19, 19, 17, 21, 20, 33, 28, 27, 24, 27, 26, 26, 18,
24, 37, 38, 33, 46, 57, 65, 62, 71, 107, 74, 47, 36,
66, 76, 70, 72, 37, 5, 1], dtype=int64))
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=0)
X_train,y_train = ros.fit_resample(X, y)
np.unique(y_train, return_counts=True)
(array([39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55,
56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72,
73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84]),
array([137, 137, 137, 137, 137, 137, 137, 137, 137, 137, 137, 137, 137,
137, 137, 137, 137, 137, 137, 137, 137, 137, 137, 137, 137, 137,
137, 137, 137, 137, 137, 137, 137, 137, 137, 137, 137, 137, 137,
137, 137, 137, 137, 137, 137, 137], dtype=int64))
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from math import sqrt
from sklearn.linear_model import LogisticRegression
#defining default values to store high accuracy and values of C at higher ccuracy
HIGH_ACCURACY =0
MAX_C = 1
# Looping for values of c from 1 till 20 to check the behaviour and accuracy of the model.
for c in range(1,20):
#defining linear model with default parameters and C=c
model = LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=c,
fit_intercept=True, intercept_scaling=1, class_weight=None,
random_state=None, solver='lbfgs', max_iter=100, multi_class='auto',
verbose=0, warm_start=False, n_jobs=None, l1_ratio=None)
#Fitting the model for the test and train sets
model = model.fit(X_train,y_train)
#Predicting the test values with the built model
pred_y = model.predict(X_test)
#printing the accuracy on the console
print("C = {} , Accuracy = {}".format(c, model.score(X_test, y_test)))
#if accuracy is high then store it in the variables
if(HIGH_ACCURACY < model.score(X_test, y_test)):
HIGH_ACCURACY = model.score(X_test, y_test)
MAX_C = c
#Printing the the highest accuracy achieved and the respective C value
print("*************************************")
print("Maximum Accuracy is achieved at C = {} , Accuracy = {}".format(MAX_C, HIGH_ACCURACY))
C = 1 , Accuracy = 0.3467094703049759 C = 2 , Accuracy = 0.36436597110754415 C = 3 , Accuracy = 0.3611556982343499 C = 4 , Accuracy = 0.3611556982343499 C = 5 , Accuracy = 0.36276083467094705 C = 6 , Accuracy = 0.36597110754414125 C = 7 , Accuracy = 0.36757624398073835 C = 8 , Accuracy = 0.36757624398073835 C = 9 , Accuracy = 0.36918138041733545 C = 10 , Accuracy = 0.3723916532905297 C = 11 , Accuracy = 0.3739967897271268 C = 12 , Accuracy = 0.36918138041733545 C = 13 , Accuracy = 0.36597110754414125 C = 14 , Accuracy = 0.38202247191011235 C = 15 , Accuracy = 0.3723916532905297 C = 16 , Accuracy = 0.36918138041733545 C = 17 , Accuracy = 0.3739967897271268 C = 18 , Accuracy = 0.36757624398073835 C = 19 , Accuracy = 0.36757624398073835 ************************************* Maximum Accuracy is achieved at C = 14 , Accuracy = 0.38202247191011235
# DEFINING MODEL WITH PARAMETER C=14 (HIGH ACCURACY)
model = LogisticRegression(C=14)
# TRAINING THE DEFINED MODEL
model = model.fit(X_train,y_train)
# PREDICTING THE TARGET WITH THE TRAINED MODEL
y_pred = model.predict(X_test)
# CALCULATING MEAN ABSOLUTE ERROR
MAE = mean_absolute_error(y_test, y_pred)
# CALCULATING THE MEAN SQUARED ERROR
MSE = mean_squared_error(y_test, y_pred)
# CALCULATING ROOT MEAN SQUARED ERROR
RMSE = sqrt(MSE)
# CALCULATING R2_SCORE
R2_SCORE=r2_score(y_test, y_pred)
# CALCULATING F1_SCORE
F1_SCORE = f1_score(y_test, y_pred, average='macro')
#PRINTING THE CALCULATED METRICS
print('LOGISTIC REGRESSION , C=14')
print('------------------------------')
# PRININTING THE ACCURACY OF THE MODEL WITH DEFINED C VALUE
print('Accuracy : {}'.format(model.score(X_test, y_test)))
print('MAE : {}'.format(round(MAE, 2)))
print('MSE : {}'.format(round(MSE, 2)))
print('RMSE : %f' % RMSE)
print('R2_SCORE : %f' % R2_SCORE)
print('F1_SCORE : %f' % F1_SCORE)
LOGISTIC REGRESSION , C=14 ------------------------------ Accuracy : 0.38202247191011235 MAE : 1.16 MSE : 3.83 RMSE : 1.958232 R2_SCORE : 0.960430 F1_SCORE : 0.340216
# GAUSSIAN NAIVE BAYES
#IMPORTING GAUSSIAN NAIVE BAYES PACKAGE
from sklearn.naive_bayes import GaussianNB
# DEFINING MODEL
gnb = GaussianNB()
# TRAINING THE DEFINED MODEL AND PREDICTING THE TARGET WITH TRAINED MODEL
y_pred = gnb.fit(X_train, y_train).predict(X_test)
# CALCULATING MEAN ABSOLUTE ERROR
MAE = mean_absolute_error(y_test, y_pred)
# CALCULATING THE MEAN SQUARED ERROR
MSE = mean_squared_error(y_test, y_pred)
# CALCULATING ROOT MEAN SQUARED ERROR
RMSE = sqrt(MSE)
# CALCULATING R2_SCORE
R2_SCORE=r2_score(y_test, y_pred)
# CALCULATING F1_SCORE
F1_SCORE = f1_score(y_test, y_pred, average='macro')
print('GAUSSIAN NAIVE BAYES ')
print('------------------------------')
print('Accuracy : {}'.format(gnb.score(X_test, y_test)))
print('MAE : {}'.format(round(MAE, 2)))
print('MSE : {}'.format(round(MSE, 2)))
print('RMSE : %f' % RMSE)
print('R2_SCORE : %f' % R2_SCORE)
print('F1_SCORE : %f' % F1_SCORE)
GAUSSIAN NAIVE BAYES ------------------------------ Accuracy : 0.30818619582664525 MAE : 1.63 MSE : 5.96 RMSE : 2.440956 R2_SCORE : 0.938516 F1_SCORE : 0.249275
# BernoulliNB Naive Bayes
from sklearn.naive_bayes import BernoulliNB
# DEFINING MODEL
bnb = BernoulliNB()
# TRAINING THE DEFINED MODEL AND PREDICTING THE TARGET WITH TRAINED MODEL
model = bnb.fit(X_train, y_train).predict(X_test)
# CALCULATING MEAN ABSOLUTE ERROR
MAE = mean_absolute_error(y_test, y_pred)
# CALCULATING THE MEAN SQUARED ERROR
MSE = mean_squared_error(y_test, y_pred)
# CALCULATING ROOT MEAN SQUARED ERROR
RMSE = sqrt(MSE)
# CALCULATING R2_SCORE
R2_SCORE=r2_score(y_test, y_pred)
# CALCULATING F1_SCORE
F1_SCORE = f1_score(y_test, y_pred, average='macro')
print('BERNOULLI NAIVE BAYES ')
print('------------------------------')
print('Accuracy : {}'.format(bnb.score(X_test, y_test)))
print('MAE : {}'.format(round(MAE, 2)))
print('MSE : {}'.format(round(MSE, 2)))
print('RMSE : %f' % RMSE)
print('R2_SCORE : %f' % R2_SCORE)
print('F1_SCORE : %f' % F1_SCORE)
BERNOULLI NAIVE BAYES ------------------------------ Accuracy : 0.22150882825040127 MAE : 1.63 MSE : 5.96 RMSE : 2.440956 R2_SCORE : 0.938516 F1_SCORE : 0.249275
from sklearn.neighbors import KNeighborsClassifier
#defining default values to store high accuracy and values of n at higher ccuracy
HIGH_ACCURACY =0
MAX_N = 1
# Looping for values of n from 5 till 20 to check the behaviour and accuracy of the model.
for n in range(5,20):
#defining model with default parameters and n_neighbors = n
knn = KNeighborsClassifier(n_neighbors = n)
#Fitting the model for the test and train sets
knn.fit(X_train,y_train)
#Predicting the test values with the built model
y_pred = knn.predict(X_test)
#printing the accuracy on the console
print('n = {} , Accuracy = {}'.format(n, knn.score(X_test, y_test)))
#if accuracy is high then store it in the variables
if(HIGH_ACCURACY < knn.score(X_test, y_test)):
HIGH_ACCURACY = knn.score(X_test, y_test)
MAX_N= n
# print('KNeighborsClassifier')
#Printing the the highest accuracy achieved and the respective C value
print("*************************************")
print("Maximum Accuracy is achieved at n = {} , Accuracy = {}".format(MAX_N, HIGH_ACCURACY))
n = 5 , Accuracy = 0.8507223113964687 n = 6 , Accuracy = 0.797752808988764 n = 7 , Accuracy = 0.7736757624398074 n = 8 , Accuracy = 0.7223113964686998 n = 9 , Accuracy = 0.6934189406099518 n = 10 , Accuracy = 0.666131621187801 n = 11 , Accuracy = 0.6324237560192616 n = 12 , Accuracy = 0.5858747993579454 n = 13 , Accuracy = 0.565008025682183 n = 14 , Accuracy = 0.5280898876404494 n = 15 , Accuracy = 0.5136436597110754 n = 16 , Accuracy = 0.4767255216693419 n = 17 , Accuracy = 0.47191011235955055 n = 18 , Accuracy = 0.47351524879614765 n = 19 , Accuracy = 0.4510433386837881 ************************************* Maximum Accuracy is achieved at n = 5 , Accuracy = 0.8507223113964687
print('KNeighborsClassifier')
print('------------------------------')
# initialising the classifier for n=5
knn = KNeighborsClassifier(n_neighbors = 5)
# applying the model for the test values
knn.fit(X_train,y_train)
# predicting the out put values for test inputs
y_pred = knn.predict(X_test)
print('Accuracy : {}'.format(knn.score(X_test, y_test)))
MAE = mean_absolute_error(y_test, y_pred)
print('MAE : {}'.format(round(MAE, 2)))
MSE = mean_squared_error(y_test, y_pred)
print('MSE : {}'.format(round(MSE, 2)))
RMSE = sqrt(MSE)
print('RMSE : %f' % RMSE)
R2_SCORE=r2_score(y_test, y_pred)
print('R2_SCORE : %f' % R2_SCORE)
F1_SCORE = f1_score(y_test, y_pred, average='macro')
print('F1_SCORE : %f' % F1_SCORE)
KNeighborsClassifier ------------------------------ Accuracy : 0.8507223113964687 MAE : 0.17 MSE : 0.21 RMSE : 0.458555 R2_SCORE : 0.997830 F1_SCORE : 0.892837
#SVC, NuSVC and LinearSVC perform both binary and multi-class classification on a dataset.
#Importing the necessary packages and libaries
from sklearn import svm
print('SUPPORT VECTOR CLASSIFICATION ')
# LIST OF KERNALS
kernal = ["linear","rbf",'poly','sigmoid']
#LOOPING THROUGH THE LIST OF KERNALS
for k in kernal:
print('------------------------------')
# LOOPING THROUGH THE PARAMETER g TO TUNE OUR MODEL ACCORDINGLY
for g in ["auto" ,"scale"]:
HIGH_ACC = 0
Max_C = 1
# LOOPING THROUGH VALUE OF C FROM 1 TO 15
# higher value of c gives l2 penality --> overfitting
for c in range(1,15):
# DEFINING THE MODEL
model = svm.SVC(C=c, kernel=k, gamma= g, decision_function_shape='ovo')
# TRAINING THE CREATED MODEL
model = model.fit(X_train,y_train)
# PREDICTING THE TATRGET WITH TEST VALUES
pred_y = model.predict(X_test)
# STORING THE HIGH ACCURACY AND CORRESPONDING C AND g VALUES
if HIGH_ACC < model.score(X_test,y_test):
HIGH_ACC = model.score(X_test,y_test)
Max_C = c
# PRINTING THE HIGH ACCURACY FOR ALL THE KERNAL WITH DIFF PARAMETER COMBINATIONS
print("Kernal = {} , C = {} , gamma = {} - MaxAccuracy = {}".format(k,Max_C,g,HIGH_ACC))
SUPPORT VECTOR CLASSIFICATION ------------------------------ Kernal = linear , C = 10 , gamma = auto - MaxAccuracy = 0.6773675762439807 Kernal = linear , C = 10 , gamma = scale - MaxAccuracy = 0.6773675762439807 ------------------------------ Kernal = rbf , C = 14 , gamma = auto - MaxAccuracy = 0.8154093097913323 Kernal = rbf , C = 14 , gamma = scale - MaxAccuracy = 0.7479935794542536 ------------------------------ Kernal = poly , C = 14 , gamma = auto - MaxAccuracy = 0.7897271268057785 Kernal = poly , C = 14 , gamma = scale - MaxAccuracy = 0.6821829855537721 ------------------------------ Kernal = sigmoid , C = 4 , gamma = auto - MaxAccuracy = 0.22632423756019263 Kernal = sigmoid , C = 5 , gamma = scale - MaxAccuracy = 0.28892455858747995
print('Linear SVC')
print('------------------------------')
# initialising the classifier
linear = svm.SVC(kernel='linear', C=10, decision_function_shape='ovo')
# applying the model for the test values
linear.fit(X_train,y_train)
# predicting the out put values for test inputs
y_pred = linear.predict(X_test)
print('Accuracy : {}'.format(linear.score(X_test, y_test)))
MAE = mean_absolute_error(y_test, y_pred)
print('MAE : {}'.format(round(MAE, 2)))
MSE = mean_squared_error(y_test, y_pred)
print('MSE : {}'.format(round(MSE, 2)))
RMSE = sqrt(MSE)
print('RMSE : %f' % RMSE)
R2_SCORE=r2_score(y_test, y_pred)
print('R2_SCORE : %f' % R2_SCORE)
F1_SCORE = f1_score(y_test, y_pred, average='macro')
print('F1_SCORE : %f' % F1_SCORE)
Linear SVC ------------------------------ Accuracy : 0.6773675762439807 MAE : 0.4 MSE : 0.58 RMSE : 0.763325 R2_SCORE : 0.993987 F1_SCORE : 0.703947
print('Radial Basis Function - SVC')
print('------------------------------')
# initialising the classifier
rbf = svm.SVC(kernel='rbf', gamma="auto", C=14, decision_function_shape='ovo')
# applying the model for the test values
rbf.fit(X_train,y_train)
# predicting the out put values for test inputs
y_pred = rbf.predict(X_test)
print('Accuracy : {}'.format(rbf.score(X_test, y_test)))
MAE = mean_absolute_error(y_test, y_pred)
print('MAE : {}'.format(round(MAE, 2)))
MSE = mean_squared_error(y_test, y_pred)
print('MSE : {}'.format(round(MSE, 2)))
RMSE = sqrt(MSE)
print('RMSE : %f' % RMSE)
R2_SCORE=r2_score(y_test, y_pred)
print('R2_SCORE : %f' % R2_SCORE)
F1_SCORE = f1_score(y_test, y_pred, average='macro')
print('F1_SCORE : %f' % F1_SCORE)
Radial Basis Function - SVC ------------------------------ Accuracy : 0.8154093097913323 MAE : 0.21 MSE : 0.29 RMSE : 0.541978 R2_SCORE : 0.996969 F1_SCORE : 0.812503
# polynomial kernel function
poly = svm.SVC(kernel='poly', degree=3, C=14, gamma="auto", decision_function_shape='ovo').fit(X_train, y_train)
print('Polynomial Kernal Function - SVC')
print('------------------------------')
# predicting the out put values for test inputs
y_pred = rbf.predict(X_test)
print('Accuracy : {}'.format(poly.score(X_test, y_test)))
MAE = mean_absolute_error(y_test, y_pred)
print('MAE : {}'.format(round(MAE, 2)))
MSE = mean_squared_error(y_test, y_pred)
print('MSE : {}'.format(round(MSE, 2)))
RMSE = sqrt(MSE)
print('RMSE : %f' % RMSE)
R2_SCORE=r2_score(y_test, y_pred)
print('R2_SCORE : %f' % R2_SCORE)
F1_SCORE = f1_score(y_test, y_pred, average='macro')
print('F1_SCORE : %f' % F1_SCORE)
Polynomial Kernal Function - SVC ------------------------------ Accuracy : 0.7897271268057785 MAE : 0.21 MSE : 0.29 RMSE : 0.541978 R2_SCORE : 0.996969 F1_SCORE : 0.812503
#Sigmoid
sig = svm.SVC(kernel='sigmoid', C=5, gamma="scale", decision_function_shape='ovo').fit(X_train, y_train)
print('Sigmoid Function - SVC')
print('------------------------------')
# predicting the out put values for test inputs
y_pred = rbf.predict(X_test)
print('Accuracy : {}'.format(sig.score(X_test, y_test)))
MAE = mean_absolute_error(y_test, y_pred)
print('MAE : {}'.format(round(MAE, 2)))
MSE = mean_squared_error(y_test, y_pred)
print('MSE : {}'.format(round(MSE, 2)))
RMSE = sqrt(MSE)
print('RMSE : %f' % RMSE)
R2_SCORE=r2_score(y_test, y_pred)
print('R2_SCORE : %f' % R2_SCORE)
F1_SCORE = f1_score(y_test, y_pred, average='macro')
print('F1_SCORE : %f' % F1_SCORE)
Sigmoid Function - SVC ------------------------------ Accuracy : 0.28892455858747995 MAE : 0.21 MSE : 0.29 RMSE : 0.541978 R2_SCORE : 0.996969 F1_SCORE : 0.812503
# MODEL Accuracy MAE MSE RMSE R2_Score F1_SCORE
# Gaussian Naive Bayes 31% 1.63 5.96 2.440956 0.938516 0.249275
# Bernoulli Naive Bayes 22% 2.95 23.42 4.839478 0.758323 0.153571
# Logistic Regression 38% 1.16 3.83 1.958232 0.96043 0.340216
# KNN Classifier 85% 0.17 0.21 0.458555 0.99783 0.892837
# Linear SVC 68% 0.4 0.58 0.763325 0.993987 0.703947
# Radial Basis Function SVC 82% 0.21 0.29 0.541978 0.996969 0.812503
# Polynomial SVC 79% 0.21 0.29 0.541978 0.996969 0.812503
# Sigmoid SVC 29% 0.21 0.29 0.541978 0.996969 0.812503
# prefer Knn because of higher F1score , lower rmse , higher r2 score .
# Though accuracy is not a measure to choose right model. KNN stands to be the best model for the current dataset.
# To ignore warnings thrown by dataprep library
import warnings
warnings.filterwarnings('ignore')
# importing the dataprep library
import dataprep
from dataprep.eda import create_report
# generating the visual using create_report method
final_report = create_report(dataset, title='Final Dataset')
final_report
| Number of Variables | 19 |
|---|---|
| Number of Rows | 2074 |
| Missing Cells | 0 |
| Missing Cells (%) | 0.0% |
| Duplicate Rows | 0 |
| Duplicate Rows (%) | 0.0% |
| Total Size in Memory | 324.1 KB |
| Average Row Size in Memory | 160.0 B |
| Numerical | 19 |
|---|
numerical
| Distinct Count | 46 |
|---|---|
| Unique (%) | 2.2% |
| Missing | 0 |
| Missing (%) | 0.0% |
| Infinite | 0 |
| Infinite (%) | 0.0% |
| Memory Size | 32.4 KB |
| Mean | 68.7083 |
| Minimum | 39 |
| Maximum | 84 |
| Zeros | 0 |
| Zeros (%) | 0.0% |
| Negatives | 0 |
| Negatives (%) | 0.0% |
| Minimum | 39 |
|---|---|
| 5-th Percentile | 49.65 |
| Q1 | 62 |
| Median | 71 |
| Q3 | 76 |
| 95-th Percentile | 81 |
| Maximum | 84 |
| Range | 45 |
| IQR | 14 |
| Mean | 68.7083 |
|---|---|
| Standard Deviation | 10.0026 |
| Variance | 100.0523 |
| Sum | 142501 |
| Skewness | -0.7547 |
| Kurtosis | -0.3761 |
| Coefficient of Variation | 0.1456 |
numerical
| Distinct Count | 94 |
|---|---|
| Unique (%) | 4.5% |
| Missing | 0 |
| Missing (%) | 0.0% |
| Infinite | 0 |
| Infinite (%) | 0.0% |
| Memory Size | 32.4 KB |
| Mean | 24.8506 |
| Minimum | 19.8 |
| Maximum | 29.1 |
| Zeros | 0 |
| Zeros (%) | 0.0% |
| Negatives | 0 |
| Negatives (%) | 0.0% |
| Minimum | 19.8 |
|---|---|
| 5-th Percentile | 21.4 |
| Q1 | 23.2 |
| Median | 25.5 |
| Q3 | 26.3 |
| 95-th Percentile | 27.3 |
| Maximum | 29.1 |
| Range | 9.3 |
| IQR | 3.1 |
| Mean | 24.8506 |
|---|---|
| Standard Deviation | 1.9323 |
| Variance | 3.7337 |
| Sum | 51540.2 |
| Skewness | -0.5303 |
| Kurtosis | -0.7565 |
| Coefficient of Variation | 0.07776 |
numerical
| Distinct Count | 652 |
|---|---|
| Unique (%) | 31.4% |
| Missing | 0 |
| Missing (%) | 0.0% |
| Infinite | 0 |
| Infinite (%) | 0.0% |
| Memory Size | 32.4 KB |
| Mean | 35.8725 |
| Minimum | 9.3 |
| Maximum | 88.4 |
| Zeros | 0 |
| Zeros (%) | 0.0% |
| Negatives | 0 |
| Negatives (%) | 0.0% |
| Minimum | 9.3 |
|---|---|
| 5-th Percentile | 11.7 |
| Q1 | 18.8 |
| Median | 29.25 |
| Q3 | 50.5 |
| 95-th Percentile | 77.7 |
| Maximum | 88.4 |
| Range | 79.1 |
| IQR | 31.7 |
| Mean | 35.8725 |
|---|---|
| Standard Deviation | 21.6236 |
| Variance | 467.5794 |
| Sum | 74399.5 |
| Skewness | 0.7696 |
| Kurtosis | -0.6317 |
| Coefficient of Variation | 0.6028 |
numerical
| Distinct Count | 1006 |
|---|---|
| Unique (%) | 48.5% |
| Missing | 0 |
| Missing (%) | 0.0% |
| Infinite | 0 |
| Infinite (%) | 0.0% |
| Memory Size | 32.4 KB |
| Mean | 5.2056 |
| Minimum | 0 |
| Maximum | 17.87 |
| Zeros | 25 |
| Zeros (%) | 1.2% |
| Negatives | 0 |
| Negatives (%) | 0.0% |
| Minimum | 0 |
|---|---|
| 5-th Percentile | 0.19 |
| Q1 | 1.68 |
| Median | 4.485 |
| Q3 | 8.1 |
| 95-th Percentile | 12.1 |
| Maximum | 17.87 |
| Range | 17.87 |
| IQR | 6.42 |
| Mean | 5.2056 |
|---|---|
| Standard Deviation | 3.9504 |
| Variance | 15.6055 |
| Sum | 10796.33 |
| Skewness | 0.5139 |
| Kurtosis | -0.6963 |
| Coefficient of Variation | 0.7589 |
numerical
| Distinct Count | 219 |
|---|---|
| Unique (%) | 10.6% |
| Missing | 0 |
| Missing (%) | 0.0% |
| Infinite | 0 |
| Infinite (%) | 0.0% |
| Memory Size | 32.4 KB |
| Mean | 9416.6428 |
| Minimum | 100 |
| Maximum | 290000 |
| Zeros | 0 |
| Zeros (%) | 0.0% |
| Negatives | 0 |
| Negatives (%) | 0.0% |
| Minimum | 100 |
|---|---|
| 5-th Percentile | 100 |
| Q1 | 500 |
| Median | 3800 |
| Q3 | 9416.6428 |
| 95-th Percentile | 42350 |
| Maximum | 290000 |
| Range | 289900 |
| IQR | 8916.6428 |
| Mean | 9416.6428 |
|---|---|
| Standard Deviation | 22287.9745 |
| Variance | 4.9675e+08 |
| Sum | 1.953e+07 |
| Skewness | 6.8164 |
| Kurtosis | 62.0487 |
| Coefficient of Variation | 2.3669 |
numerical
| Distinct Count | 445 |
|---|---|
| Unique (%) | 21.5% |
| Missing | 0 |
| Missing (%) | 0.0% |
| Infinite | 0 |
| Infinite (%) | 0.0% |
| Memory Size | 32.4 KB |
| Mean | 193.2985 |
| Minimum | 49 |
| Maximum | 683 |
| Zeros | 0 |
| Zeros (%) | 0.0% |
| Negatives | 0 |
| Negatives (%) | 0.0% |
| Minimum | 49 |
|---|---|
| 5-th Percentile | 62.65 |
| Q1 | 101 |
| Median | 161 |
| Q3 | 255 |
| 95-th Percentile | 445.35 |
| Maximum | 683 |
| Range | 634 |
| IQR | 154 |
| Mean | 193.2985 |
|---|---|
| Standard Deviation | 119.867 |
| Variance | 14368.0976 |
| Sum | 400901 |
| Skewness | 1.2728 |
| Kurtosis | 1.4242 |
| Coefficient of Variation | 0.6201 |
numerical
| Distinct Count | 124 |
|---|---|
| Unique (%) | 6.0% |
| Missing | 0 |
| Missing (%) | 0.0% |
| Infinite | 0 |
| Infinite (%) | 0.0% |
| Memory Size | 32.4 KB |
| Mean | 75008.4851 |
| Minimum | 18 |
| Maximum | 2.0254e+06 |
| Zeros | 0 |
| Zeros (%) | 0.0% |
| Negatives | 0 |
| Negatives (%) | 0.0% |
| Minimum | 18 |
|---|---|
| 5-th Percentile | 101 |
| Q1 | 979 |
| Median | 6704 |
| Q3 | 52218 |
| 95-th Percentile | 219826 |
| Maximum | 2.0254e+06 |
| Range | 2.0254e+06 |
| IQR | 51239 |
| Mean | 75008.4851 |
|---|---|
| Standard Deviation | 254538.4035 |
| Variance | 6.479e+10 |
| Sum | 1.5557e+08 |
| Skewness | 6.1166 |
| Kurtosis | 39.4687 |
| Coefficient of Variation | 3.3935 |
numerical
| Distinct Count | 1105 |
|---|---|
| Unique (%) | 53.3% |
| Missing | 0 |
| Missing (%) | 0.0% |
| Infinite | 0 |
| Infinite (%) | 0.0% |
| Memory Size | 32.4 KB |
| Mean | 6.6446 |
| Minimum | 0.01 |
| Maximum | 22.35 |
| Zeros | 0 |
| Zeros (%) | 0.0% |
| Negatives | 0 |
| Negatives (%) | 0.0% |
| Minimum | 0.01 |
|---|---|
| 5-th Percentile | 0.46 |
| Q1 | 1.75 |
| Median | 6.13 |
| Q3 | 10.18 |
| 95-th Percentile | 15.721 |
| Maximum | 22.35 |
| Range | 22.34 |
| IQR | 8.43 |
| Mean | 6.6446 |
|---|---|
| Standard Deviation | 5.0699 |
| Variance | 25.7043 |
| Sum | 13780.87 |
| Skewness | 0.5265 |
| Kurtosis | -0.6652 |
| Coefficient of Variation | 0.763 |
numerical
| Distinct Count | 1381 |
|---|---|
| Unique (%) | 66.6% |
| Missing | 0 |
| Missing (%) | 0.0% |
| Infinite | 0 |
| Infinite (%) | 0.0% |
| Memory Size | 32.4 KB |
| Mean | 11.6437 |
| Minimum | 0.26 |
| Maximum | 59.09 |
| Zeros | 0 |
| Zeros (%) | 0.0% |
| Negatives | 0 |
| Negatives (%) | 0.0% |
| Minimum | 0.26 |
|---|---|
| 5-th Percentile | 1.5795 |
| Q1 | 4.99 |
| Median | 8.325 |
| Q3 | 17.35 |
| 95-th Percentile | 28.1235 |
| Maximum | 59.09 |
| Range | 58.83 |
| IQR | 12.36 |
| Mean | 11.6437 |
|---|---|
| Standard Deviation | 9.2928 |
| Variance | 86.3565 |
| Sum | 24149.09 |
| Skewness | 1.5618 |
| Kurtosis | 3.4582 |
| Coefficient of Variation | 0.7981 |
numerical
| Distinct Count | 1307 |
|---|---|
| Unique (%) | 63.0% |
| Missing | 0 |
| Missing (%) | 0.0% |
| Infinite | 0 |
| Infinite (%) | 0.0% |
| Memory Size | 32.4 KB |
| Mean | 12.8389 |
| Minimum | 0 |
| Maximum | 64.24 |
| Zeros | 72 |
| Zeros (%) | 3.5% |
| Negatives | 0 |
| Negatives (%) | 0.0% |
| Minimum | 0 |
|---|---|
| 5-th Percentile | 0.01 |
| Q1 | 1.145 |
| Median | 5.61 |
| Q3 | 22.34 |
| 95-th Percentile | 42.522 |
| Maximum | 64.24 |
| Range | 64.24 |
| IQR | 21.195 |
| Mean | 12.8389 |
|---|---|
| Standard Deviation | 14.9566 |
| Variance | 223.7009 |
| Sum | 26627.93 |
| Skewness | 1.1647 |
| Kurtosis | 0.2275 |
| Coefficient of Variation | 1.1649 |
numerical
| Distinct Count | 1522 |
|---|---|
| Unique (%) | 73.4% |
| Missing | 0 |
| Missing (%) | 0.0% |
| Infinite | 0 |
| Infinite (%) | 0.0% |
| Memory Size | 32.4 KB |
| Mean | 15.5803 |
| Minimum | 0.05 |
| Maximum | 72.74 |
| Zeros | 0 |
| Zeros (%) | 0.0% |
| Negatives | 0 |
| Negatives (%) | 0.0% |
| Minimum | 0.05 |
|---|---|
| 5-th Percentile | 0.8595 |
| Q1 | 4.0025 |
| Median | 13.63 |
| Q3 | 23.72 |
| 95-th Percentile | 38.1445 |
| Maximum | 72.74 |
| Range | 72.69 |
| IQR | 19.7175 |
| Mean | 15.5803 |
|---|---|
| Standard Deviation | 12.8611 |
| Variance | 165.4069 |
| Sum | 32313.6 |
| Skewness | 1.0476 |
| Kurtosis | 1.3842 |
| Coefficient of Variation | 0.8255 |
numerical
| Distinct Count | 1980 |
|---|---|
| Unique (%) | 95.5% |
| Missing | 0 |
| Missing (%) | 0.0% |
| Infinite | 0 |
| Infinite (%) | 0.0% |
| Memory Size | 32.4 KB |
| Mean | 116.448 |
| Minimum | 1.02 |
| Maximum | 463.91 |
| Zeros | 0 |
| Zeros (%) | 0.0% |
| Negatives | 0 |
| Negatives (%) | 0.0% |
| Minimum | 1.02 |
|---|---|
| 5-th Percentile | 6.624 |
| Q1 | 31.9475 |
| Median | 98.2 |
| Q3 | 178.51 |
| 95-th Percentile | 291.8985 |
| Maximum | 463.91 |
| Range | 462.89 |
| IQR | 146.5625 |
| Mean | 116.448 |
|---|---|
| Standard Deviation | 94.9286 |
| Variance | 9011.434 |
| Sum | 241513.11 |
| Skewness | 0.7646 |
| Kurtosis | -0.237 |
| Coefficient of Variation | 0.8152 |
numerical
| Distinct Count | 1577 |
|---|---|
| Unique (%) | 76.0% |
| Missing | 0 |
| Missing (%) | 0.0% |
| Infinite | 0 |
| Infinite (%) | 0.0% |
| Memory Size | 32.4 KB |
| Mean | 17.395 |
| Minimum | 0.07 |
| Maximum | 191.75 |
| Zeros | 0 |
| Zeros (%) | 0.0% |
| Negatives | 0 |
| Negatives (%) | 0.0% |
| Minimum | 0.07 |
|---|---|
| 5-th Percentile | 1.3165 |
| Q1 | 5.1825 |
| Median | 12.16 |
| Q3 | 24 |
| 95-th Percentile | 47.607 |
| Maximum | 191.75 |
| Range | 191.68 |
| IQR | 18.8175 |
| Mean | 17.395 |
|---|---|
| Standard Deviation | 19.817 |
| Variance | 392.7147 |
| Sum | 36077.21 |
| Skewness | 3.9626 |
| Kurtosis | 25.1503 |
| Coefficient of Variation | 1.1392 |
numerical
| Distinct Count | 2074 |
|---|---|
| Unique (%) | 100.0% |
| Missing | 0 |
| Missing (%) | 0.0% |
| Infinite | 0 |
| Infinite (%) | 0.0% |
| Memory Size | 32.4 KB |
| Mean | 6.3738 |
| Minimum | 1.7014 |
| Maximum | 20.4134 |
| Zeros | 0 |
| Zeros (%) | 0.0% |
| Negatives | 0 |
| Negatives (%) | 0.0% |
| Minimum | 1.7014 |
|---|---|
| 5-th Percentile | 2.8371 |
| Q1 | 4.4335 |
| Median | 6.1969 |
| Q3 | 8.1496 |
| 95-th Percentile | 10.3428 |
| Maximum | 20.4134 |
| Range | 18.712 |
| IQR | 3.7161 |
| Mean | 6.3738 |
|---|---|
| Standard Deviation | 2.3791 |
| Variance | 5.6603 |
| Sum | 13219.2983 |
| Skewness | 0.3744 |
| Kurtosis | 0.2049 |
| Coefficient of Variation | 0.3733 |
numerical
| Distinct Count | 78 |
|---|---|
| Unique (%) | 3.8% |
| Missing | 0 |
| Missing (%) | 0.0% |
| Infinite | 0 |
| Infinite (%) | 0.0% |
| Memory Size | 32.4 KB |
| Mean | 86.6432 |
| Minimum | 19 |
| Maximum | 99 |
| Zeros | 0 |
| Zeros (%) | 0.0% |
| Negatives | 0 |
| Negatives (%) | 0.0% |
| Minimum | 19 |
|---|---|
| 5-th Percentile | 51 |
| Q1 | 83 |
| Median | 92 |
| Q3 | 97 |
| 95-th Percentile | 99 |
| Maximum | 99 |
| Range | 80 |
| IQR | 14 |
| Mean | 86.6432 |
|---|---|
| Standard Deviation | 14.9612 |
| Variance | 223.8369 |
| Sum | 179698 |
| Skewness | -1.9149 |
| Kurtosis | 3.5448 |
| Coefficient of Variation | 0.1727 |
numerical
| Distinct Count | 362 |
|---|---|
| Unique (%) | 17.4% |
| Missing | 0 |
| Missing (%) | 0.0% |
| Infinite | 0 |
| Infinite (%) | 0.0% |
| Memory Size | 32.4 KB |
| Mean | 12.6951 |
| Minimum | 0.1 |
| Maximum | 116.2 |
| Zeros | 0 |
| Zeros (%) | 0.0% |
| Negatives | 0 |
| Negatives (%) | 0.0% |
| Minimum | 0.1 |
|---|---|
| 5-th Percentile | 2.9 |
| Q1 | 6 |
| Median | 10.4 |
| Q3 | 15.4 |
| 95-th Percentile | 30.27 |
| Maximum | 116.2 |
| Range | 116.1 |
| IQR | 9.4 |
| Mean | 12.6951 |
|---|---|
| Standard Deviation | 10.8287 |
| Variance | 117.2611 |
| Sum | 26329.6 |
| Skewness | 3.6664 |
| Kurtosis | 22.6907 |
| Coefficient of Variation | 0.853 |
numerical
| Distinct Count | 1796 |
|---|---|
| Unique (%) | 86.6% |
| Missing | 0 |
| Missing (%) | 0.0% |
| Infinite | 0 |
| Infinite (%) | 0.0% |
| Memory Size | 32.4 KB |
| Mean | 595.0553 |
| Minimum | 240.4 |
| Maximum | 1317.7 |
| Zeros | 0 |
| Zeros (%) | 0.0% |
| Negatives | 0 |
| Negatives (%) | 0.0% |
| Minimum | 240.4 |
|---|---|
| 5-th Percentile | 328.125 |
| Q1 | 437.325 |
| Median | 588.35 |
| Q3 | 710.125 |
| 95-th Percentile | 935.445 |
| Maximum | 1317.7 |
| Range | 1077.3 |
| IQR | 272.8 |
| Mean | 595.0553 |
|---|---|
| Standard Deviation | 194.1573 |
| Variance | 37697.0431 |
| Sum | 1.2341e+06 |
| Skewness | 0.644 |
| Kurtosis | 0.357 |
| Coefficient of Variation | 0.3263 |
numerical
| Distinct Count | 83 |
|---|---|
| Unique (%) | 4.0% |
| Missing | 0 |
| Missing (%) | 0.0% |
| Infinite | 0 |
| Infinite (%) | 0.0% |
| Memory Size | 32.4 KB |
| Mean | 1.4701 |
| Minimum | 0 |
| Maximum | 9.2 |
| Zeros | 31 |
| Zeros (%) | 1.5% |
| Negatives | 0 |
| Negatives (%) | 0.0% |
| Minimum | 0 |
|---|---|
| 5-th Percentile | 0.1 |
| Q1 | 0.3 |
| Median | 0.7 |
| Q3 | 2.5 |
| 95-th Percentile | 4.4 |
| Maximum | 9.2 |
| Range | 9.2 |
| IQR | 2.2 |
| Mean | 1.4701 |
|---|---|
| Standard Deviation | 1.5467 |
| Variance | 2.3922 |
| Sum | 3048.9 |
| Skewness | 1.469 |
| Kurtosis | 2.2657 |
| Coefficient of Variation | 1.0521 |
numerical
| Distinct Count | 74 |
|---|---|
| Unique (%) | 3.6% |
| Missing | 0 |
| Missing (%) | 0.0% |
| Infinite | 0 |
| Infinite (%) | 0.0% |
| Memory Size | 32.4 KB |
| Mean | 86.7208 |
| Minimum | 8 |
| Maximum | 99 |
| Zeros | 0 |
| Zeros (%) | 0.0% |
| Negatives | 0 |
| Negatives (%) | 0.0% |
| Minimum | 8 |
|---|---|
| 5-th Percentile | 53 |
| Q1 | 82 |
| Median | 93 |
| Q3 | 97 |
| 95-th Percentile | 99 |
| Maximum | 99 |
| Range | 91 |
| IQR | 15 |
| Mean | 86.7208 |
|---|---|
| Standard Deviation | 14.6434 |
| Variance | 214.43 |
| Sum | 179859 |
| Skewness | -1.8239 |
| Kurtosis | 3.3041 |
| Coefficient of Variation | 0.1689 |
#creating a new dataset with life expectancy greater than 73
countries_above73 = RAW_DATASET[RAW_DATASET['Life expectancy']>73]
# Listing the countries with high consumption of egg
countries_above73[countries_above73['Eggs Consumption']>18]['Country'].unique()
array(['China', 'Mexico', 'Netherlands', 'Japan', 'Denmark', 'Kuwait',
'Paraguay'], dtype=object)
# Listing the countries with high consumption of beef
countries_above73[countries_above73['Bovine Meat']>35]['Country'].unique()
array(['Argentina', 'Australia', 'Brazil', 'New Zealand', 'Uruguay'],
dtype=object)
# Listing the countries with high consumption of Pork
countries_above73[countries_above73['Pig Meat']>50]['Country'].unique()
array(['Poland', 'Montenegro', 'Spain', 'Germany', 'Austria',
'Luxembourg', 'Netherlands'], dtype=object)
# Listing the countries with high consumption of Poultry Meat
countries_above73[countries_above73['Poultry Meat']>50]['Country'].unique()
array(['Kuwait', 'Barbados', 'Israel', 'Jamaica', 'United Arab Emirates',
'Trinidad and Tobago'], dtype=object)
# Listing the countries with high consumption of Milk
countries_above73[countries_above73['Milk Consumption']>300]['Country'].unique()
array(['Albania', 'Netherlands', 'Montenegro', 'Sweden', 'Estonia',
'Lithuania', 'Switzerland', 'Denmark', 'Finland', 'Greece',
'Ireland', 'Luxembourg'], dtype=object)
# Listing the countries with high consumption of seafood
countries_above73[countries_above73['Fish and Seafood']>50]['Country'].unique()
array(['Malaysia', 'Maldives', 'Iceland', 'Japan', 'Norway', 'Portugal'],
dtype=object)
# Grouping protein consumption columns
# Protein_intake = Eggs Consumption + Bovine Meat + Pig Meat + Poultry Meat + Poultry Meat + Fish and Seafood
dataset["Protein_Intake"] = dataset["Eggs Consumption"] + dataset["Bovine Meat"] + dataset["Pig Meat"] + dataset["Poultry Meat"] + dataset["Poultry Meat"] + dataset["Fish and Seafood"]
dataset.head()
| Life expectancy | BMI | ChildMalnutrition | Alcohol | HIV | Adult Mortality | ChildMortality | Eggs Consumption | Bovine Meat | Pig Meat | Poultry Meat | Milk Consumption | Fish and Seafood | Medical Expenditure | Diphtheria | Suicides | NCD | Env Pollution | Polio | Protein_Intake | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 76 | 26.2 | 22.9 | 5.03 | 100.0 | 103 | 868 | 7.72 | 21.24 | 11.03 | 13.41 | 301.27 | 5.86 | 4.795327 | 99.0 | 7.6 | 490.2 | 0.5 | 99.0 | 72.67 |
| 1 | 77 | 26.3 | 23.1 | 4.43 | 100.0 | 103 | 868 | 12.69 | 22.40 | 11.04 | 12.76 | 299.85 | 4.97 | 5.055262 | 99.0 | 4.8 | 507.1 | 0.4 | 99.0 | 76.62 |
| 2 | 77 | 26.4 | 23.6 | 4.28 | 100.0 | 100 | 868 | 12.45 | 22.50 | 10.88 | 13.23 | 303.72 | 4.87 | 5.385599 | 99.0 | 4.8 | 486.4 | 0.4 | 99.0 | 77.16 |
| 3 | 75 | 25.4 | 33.9 | 0.54 | 200.0 | 98 | 60319 | 7.93 | 5.43 | 0.00 | 6.86 | 151.06 | 4.40 | 6.547214 | 95.0 | 2.8 | 464.4 | 0.8 | 95.0 | 31.48 |
| 4 | 76 | 25.5 | 33.9 | 0.55 | 200.0 | 96 | 60319 | 8.65 | 5.35 | 0.00 | 6.64 | 125.37 | 4.16 | 6.978492 | 95.0 | 2.7 | 460.7 | 0.7 | 95.0 | 31.44 |
plt.rcParams["figure.figsize"] = (15,5)
plt.scatter(dataset["Life expectancy"], dataset["Protein_Intake"], alpha=0.5)
plt.xlabel('Life expectancy')
plt.ylabel('Protein_Intake')
plt.show()